# libraries
import pandas as pd
import numpy as np
import datetime
import itertools
from tqdm.notebook import tqdm
from sklearn.metrics import r2_score
import lightgbm as lgb
import shap
import graphviz
# plotly stuff
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# configs
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
pd.options.display.max_rows = 50
pd.options.display.max_columns = 500
pd.options.mode.chained_assignment = None # use at your own risk..
pd.__version__,lgb.__version__,shap.__version__
# Load the house sales data and parse the date-like columns.
df = pd.read_csv('house_data.csv')
df.date = pd.to_datetime(df.date)
# yr_built / yr_renovated hold bare years; parse them as Jan-1 timestamps
df.yr_built = pd.to_datetime(df.yr_built, format='%Y')
# errors='coerce': values that are not a valid year (presumably 0/blank for
# never-renovated houses -- confirm against the raw file) become NaT
df.yr_renovated = pd.to_datetime(df.yr_renovated, format='%Y', errors='coerce')
df
df.info()
# completeness of yr_renovated: share of rows with a parseable renovation year
'{0:.2%}'.format(sum(df.yr_renovated.notnull()) / len(df))
yr_renovated has missing values, which is only logical — most houses were never renovated. Completeness is only 4.23%; the expected share of renovated houses is unknown at this point, so I will assume this is OK.
# Numeric summary plus one histogram per column for a first look at the data.
df.describe()
for col in df.columns:
    px.histogram(df[col]).show()
# How do each house's sqft figures relate to the *15 columns?
# (presumably stats over 15 nearest neighbors -- confirm; sampled to 1000
# points to keep the scatter readable)
px.scatter(df.sample(1000), x='sqft_living15', y='sqft_living', trendline='ols')
px.scatter(df.sample(1000), x='sqft_lot15', y='sqft_lot', trendline='ols')
df[df.yr_renovated <= df.yr_built].sort_values('yr_renovated')
Looks fine, no houses renovated before they were built.
df[df.sqft_living != df.sqft_above + df.sqft_basement]
OK, no anomalies here: sqft_living always equals sqft_above + sqft_basement.
# Suspicious rows: living area larger than the whole lot
df[df.sqft_living > df.sqft_lot]
px.scatter(df[df.sqft_living > df.sqft_lot], x='sqft_living', y='sqft_lot')
Since sqft_living = sqft_above + sqft_basement checks out, sqft_living looks trustworthy; I will assume the sqft_lot values in these rows are the wrong ones and clean them as null.
If we simply use the raw year for the built/renovated columns, those features will become progressively outdated in the future. We should instead use the time elapsed between those dates and the data capture date. If a divisor is zero or NaN, the calculation returns NaN, which LightGBM handles just fine.
Since LightGBM is a tree-based approach, which struggles to capture both additive and linear relationships, explicitly adding differences and ratios that represent seemingly important relationships can help.
Scores are generated from related columns to overcome the additive limitation of LightGBM, which can't straightforwardly combine them on its own.
def nullify(df):
    """Clean impossible lot sizes: a lot can't be smaller than the living area.

    Where sqft_living exceeds sqft_lot, the lot value is treated as bad data
    and replaced with NaN. Mutates and returns the same DataFrame.
    """
    bad_lot = df['sqft_living'] > df['sqft_lot']
    # np.where keeps the original value where the lot size is plausible
    df['sqft_lot'] = np.where(bad_lot, np.nan, df['sqft_lot'])
    return df
def feng(df):
    """Feature engineering: categorical zipcode, age features, ratios and
    hand-made combined scores. Mutates and returns the same DataFrame.
    """
    # zipcode is an area label, not a quantity -> treat as categorical
    df['zipcode'] = pd.Categorical(df['zipcode'])
    # ages in years as of the sale date (stays meaningful as the data ages)
    built_age = (df['date'] - df['yr_built']).dt.days / 365.25
    reno_age = (df['date'] - df['yr_renovated']).dt.days / 365.25
    df['yr_built_ago'] = built_age
    df['yr_renovated_ago'] = reno_age
    df['yr_ren_blt_ago_dif'] = reno_age - built_age
    df['yr_ren_blt_ago_rat'] = reno_age / built_age
    # room counts and per-room / per-floor space
    df['bath_bed_sum'] = df['bathrooms'] + df['bedrooms']
    df['bath_bed_rat'] = df['bathrooms'] / df['bedrooms']
    df['bed_floor_rat'] = df['bedrooms'] / df['floors']
    df['bath_bed_sum_sqft_living_rat'] = df['sqft_living'] / df['bath_bed_sum']
    df['floors_sqft_living_rat'] = df['sqft_living'] / df['floors']
    # square-footage ratios (house vs lot, house vs the *15 columns)
    df['sqft_basement_rat'] = df['sqft_basement'] / df['sqft_living']
    df['sqft_living_rat'] = df['sqft_living'] / df['sqft_lot']
    df['sqft_living15_rat'] = df['sqft_living'] / df['sqft_living15']
    df['sqft_lot15_rat'] = df['sqft_lot'] / df['sqft_lot15']
    # combined scores so the trees don't have to add these up themselves
    df['water_view_score'] = df['waterfront'] * 5 + df['view']
    df['condition_grade_score'] = df['condition'] * 2 + df['grade']
    return df
def exclusions(df):
    """Drop columns that must not reach the model: the row id and the raw
    year columns (superseded by the *_ago features). Mutates and returns df.
    """
    for col in ('id', 'yr_built', 'yr_renovated'):
        del df[col]
    return df
def r2_metric(preds, train_data):
    """Custom LightGBM eval function: R-squared on the dataset's labels.

    Returns the (name, value, is_higher_better) triple LightGBM expects;
    higher R-squared is better.
    """
    labels = train_data.get_label()
    return 'r2', r2_score(labels, preds), True
def lgbm_model_xv(df, inputs, output, params_grid):
    """Cross-validate LightGBM over a parameter grid.

    Parameters
    ----------
    df : training frame.
    inputs : list of feature column names.
    output : label column name.
    params_grid : list of LightGBM parameter dicts to try.

    Returns one row per parameter set with the chosen round count, the
    validation performance ('perfo'), the train performance ('perfo_train'),
    a variance-weighted score ('perfo_vw') and the raw CV tail columns.
    """
    results = []
    tr_ds = lgb.Dataset(df[inputs], label=df[output])
    for params in tqdm(params_grid):
        cv = lgb.cv(
            params = params,
            train_set = tr_ds,
            num_boost_round = 10000,
            early_stopping_rounds = 25,
            verbose_eval = False,
            stratified = False,
            eval_train_metric = True,
            feval = r2_metric,
            seed = 7,
        )
        # last row of the CV history = metrics at the best (early-stopped) round
        cv = pd.DataFrame(cv).tail(1)
        row = pd.DataFrame(params, index=[0])
        row['num_boost_round'] = cv.index.values[0] + 1
        # positional columns of the CV frame: presumably train mean at 0 and
        # valid mean at 2 when eval_train_metric=True -- confirm per version
        row['perfo'] = cv.iat[0, 2]
        row['perfo_train'] = cv.iat[0, 0]
        # variance-weighted performance: penalizes the train/valid gap
        row['perfo_vw'] = 2 * row['perfo'] - row['perfo_train']
        results.append(pd.concat([row, cv.reset_index(drop=True)], axis=1))
    # DataFrame.append was removed in pandas 2.0 -- build the frame once
    return pd.concat(results, ignore_index=True) if results else pd.DataFrame()
def get_best_params(params_df, ascending):
    """Pick the best row by variance-weighted performance.

    Parameters
    ----------
    params_df : frame produced by lgbm_model_xv (parameter columns first,
        then 'perfo', 'perfo_train', 'perfo_vw', then raw CV columns).
    ascending : sort direction; tail(1) is taken, so ascending=True selects
        the row with the highest 'perfo_vw'.

    Returns (params_dict, perfo, perfo_vw) where params_dict holds only the
    columns before 'perfo' (i.e. the LightGBM parameters + num_boost_round).

    Bug fixed: the original indexed the selected row with label 0
    (params_df['perfo'][0], .to_dict('index')[0]), which raises KeyError
    whenever the best row's index label is not 0; the index is now reset
    so positional and label access agree.
    """
    best = params_df.sort_values('perfo_vw', ascending=ascending).tail(1)
    best = best.reset_index(drop=True)
    perfo = best['perfo'].iloc[0]
    perfo_vw = best['perfo_vw'].iloc[0]
    # keep only the columns before 'perfo': the actual model parameters
    best = best.iloc[:, :best.columns.get_loc('perfo')]
    params = best.to_dict('index')[0]
    return params, perfo, perfo_vw
def train_lgb(df, inputs, params, output='price'):
    """Train the final LightGBM booster with the chosen parameters.

    Parameters
    ----------
    df : training frame.
    inputs : list of feature column names.
    params : tuple as returned by get_best_params; params[0] is the
        parameter dict and must contain 'num_boost_round'.
    output : label column name (default 'price'). The original read a
        module-level global `output`, which made the function fail outside
        this script; the default keeps existing calls working.

    Returns the fitted lgb.Booster.
    """
    # copy so the caller's dict is not mutated by the pop below
    train_params = dict(params[0])
    rounds = train_params.pop('num_boost_round')
    tr_ds = lgb.Dataset(df[inputs], label=df[output])
    booster = lgb.train(
        params = train_params,
        train_set = tr_ds,
        num_boost_round = rounds
    )
    return booster
# read input (same parsing as the exploration section above)
df = pd.read_csv('house_data.csv')
df.date = pd.to_datetime(df.date)
df.yr_built = pd.to_datetime(df.yr_built, format='%Y')
# unparseable renovation years (presumably never-renovated rows) become NaT
df.yr_renovated = pd.to_datetime(df.yr_renovated, format='%Y', errors='coerce')
# general data preparation
df = nullify(df)     # null out sqft_lot where it is smaller than sqft_living
df = feng(df)        # engineered features (ages, ratios, scores)
df = exclusions(df)  # drop id + raw year columns
# testing flag: mark the most recent ~20% of sales as the out-of-time test set
df = df.sort_values('date')
df['testing'] = 1
df.testing = df.testing.cumsum()  # running row number (1..n) in date order
df.testing = np.where(df.testing < df.testing.max() * 0.8,0,1)  # 0 = train, 1 = test
# define inputs and output column
inputs = list(set(df.columns) - set(['date','price','testing']))
output = 'price'
# parameters to explore
params_grid = {
    'metric': ['r2_metric'],  # custom metric; actually computed via feval
    'objective': ['mse','mae'], # mae penalizes outliers less, let's see who wins
    'num_leaves': [4,8,16,32],
    'learning_rate': [0.1],
    'feature_fraction': [0.5,1],
    'bagging_fraction': [0.5,1],
    'bagging_freq': [1],
    'verbosity':[-1],
    'seed':[7],
}
# expand the dict of lists into a list of dicts (full cartesian product)
params_grid = [dict(zip(params_grid.keys(), v)) for v in itertools.product(*params_grid.values())]
# run parameter search on the training portion only
params_df = lgbm_model_xv(df[df.testing==0],inputs,output,params_grid)
# best variance-weighted performance first
params_df = params_df.sort_values('perfo_vw', ascending=False).reset_index(drop=True)
params_df
Let's now use these parameters to train the final model, to be evaluated on the out-of-time test sample.
# train chosen model
# NOTE(review): row 4 of the sorted grid is picked by hand rather than row 0
# (the top perfo_vw row) -- presumably a deliberate manual choice; confirm.
params = pd.DataFrame(params_df.iloc[4]).transpose().reset_index(drop=True)
params = get_best_params(params, True)  # -> (params_dict, perfo, perfo_vw)
booster = train_lgb(df[df.testing==0], inputs, params)
# score the out-of-time test set
df_test = df[df.testing==1].copy().reset_index(drop=True)
df_test['price_yhat'] = booster.predict(df_test[inputs])
perfo_tr = params[1]  # cross-validated R2 from the grid search
perfo_te = r2_score(df_test.price,df_test.price_yhat)
print('Training R2:',round(perfo_tr,4))
print('Testing R2:',round(perfo_te,4))
# relative performance drop from CV to out-of-time test
print('Loss:......:','{0:.2%}'.format(1 - perfo_te / perfo_tr))
A 2.15% loss on out-of-time testing looks acceptable to me, although more exploration could be done.
# Mean absolute percentage error on the out-of-time test set.
# .copy() makes mape an explicit independent frame, so adding the new column
# is safe even with pd.options.mode.chained_assignment disabled above.
mape = df_test[['price','price_yhat']].copy()
mape['percentage_error'] = abs(mape.price - mape.price_yhat) / mape.price
'{0:.2%}'.format(mape.percentage_error.mean())
12.77% MAPE also looks acceptable, although further investigation could be in regards to in what price range is the most error coming from.
px.scatter(df_test, x='price', y='price_yhat', trendline='ols')
Residuals visual analysis shows that the model is doing pretty well overall, as seen before with R2 and MAPE.
Let's see how we can explain our model. Since it's a tree-based algorithm, we can start by taking a look at the first tree.
lgb.create_tree_digraph(booster, tree_index=0)
A simple tree-based algorithm simply takes the features given and tries to split them to minimize the chosen loss function and in turn maximize the chosen performance metric. This tree looks really nice and with some effort we could extract information, but our algorithm is quite complex and actually has 791 trees which are aggregated to compute the final prediction. We can't go through all of them. Therefore, we will use SHAP, a package with a game-theory approach to explaining complex, black box models such as this one.
# shap init
shap.initjs()
# TreeExplainer: SHAP values for tree ensembles; computed on the test features
explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(df_test[inputs])
We can see at a record level which features are driving changes up or down in price, which has a base value of its own mean at 532k.
# force plot for the first test record: how each feature pushes the
# prediction up or down from the base (expected) value
shap_plot = shap.force_plot(explainer.expected_value, shap_values[0], df_test[inputs].iloc[0])
shap.save_html('shap_force_single.html', shap_plot)
shap_plot
Overall, everything we are seeing makes sense relative to what we expect, but this approach lets us see exactly what the score components are and how much each one affects the final score. This way, an analyst from the real estate side can look at this 'force plot' and make the final decision themselves, by understanding why the model is settling at a particular value and what information it could be missing.
px.scatter(df_test, x='long', y='lat', color='price_yhat')
What about overall feature impact on price? Let's take a look at a much more complex visualization:
# stacked force plot over the first 1000 test records
shap_plot = shap.force_plot(explainer.expected_value, shap_values[:1000], df_test[inputs].iloc[:1000,:])
shap.save_html('shap_force_multiple.html', shap_plot)
shap_plot
You can see lots of things here, such as how individual features push groups of predictions up or down across the sample.
Finally, let's take a look at overall feature value based on impact on model output.
shap.summary_plot(shap_values, df_test[inputs], max_display=len(inputs))
Our condition+grade and water+view scores made it pretty far up, good job!